import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor


Data=pd.read_csv("/content/drive/MyDrive/Datasets/AnticipateBuildingConsumptionNeeds/2016_Building_Energy_Benchmarking.csv")


Data.shape

(3376, 46)


print(Data.isnull().sum().sum()," mssing number out of ",Data.isnull().sum().sum()+Data.notna().sum().sum())

19952  mssing number out of  155296


print(np.round((Data.isnull().sum().sum()*100) /((Data.isnull().sum().sum()+Data.notna().sum().sum()))), "% of missing data")

13.0 % of missing data


Data.head(3)


Data.describe(exclude="object")


print(Data.describe(exclude="object").columns.tolist())
print(len(Data.describe(exclude="object").columns.tolist()))

['OSEBuildingID', 'DataYear', 'ZipCode', 'CouncilDistrictCode', 'Latitude', 'Longitude', 'YearBuilt', 'NumberofBuildings', 'NumberofFloors', 'PropertyGFATotal', 'PropertyGFAParking', 'PropertyGFABuilding(s)', 'LargestPropertyUseTypeGFA', 'SecondLargestPropertyUseTypeGFA', 'ThirdLargestPropertyUseTypeGFA', 'ENERGYSTARScore', 'SiteEUI(kBtu/sf)', 'SiteEUIWN(kBtu/sf)', 'SourceEUI(kBtu/sf)', 'SourceEUIWN(kBtu/sf)', 'SiteEnergyUse(kBtu)', 'SiteEnergyUseWN(kBtu)', 'SteamUse(kBtu)', 'Electricity(kWh)', 'Electricity(kBtu)', 'NaturalGas(therms)', 'NaturalGas(kBtu)', 'DefaultData', 'Comments', 'TotalGHGEmissions', 'GHGEmissionsIntensity']
31


print(Data.drop(Data.describe(exclude="object").columns.tolist(),axis=1).columns.tolist())
print(len(Data.drop(Data.describe(exclude="object").columns.tolist(),axis=1).columns.tolist()))

['BuildingType', 'PrimaryPropertyType', 'PropertyName', 'Address', 'City', 'State', 'TaxParcelIdentificationNumber', 'Neighborhood', 'ListOfAllPropertyUseTypes', 'LargestPropertyUseType', 'SecondLargestPropertyUseType', 'ThirdLargestPropertyUseType', 'YearsENERGYSTARCertified', 'ComplianceStatus', 'Outlier']
15


Data['BuildingType'].unique()

array(['NonResidential', 'Nonresidential COS', 'Multifamily MR (5-9)',
       'SPS-District K-12', 'Campus', 'Multifamily LR (1-4)',
       'Multifamily HR (10+)', 'Nonresidential WA'], dtype=object)


print("NonResidential:", len(Data[Data['BuildingType'].isin(["NonResidential"])]))

NonResidential: 1460


print("Nonresidential COS:", len(Data[Data['BuildingType'].isin(["Nonresidential COS"])]))

Nonresidential COS: 85


print("Nonresidential WA:", len(Data[Data['BuildingType'].isin(["Nonresidential WA"])]))

Nonresidential WA: 1


DataNR=Data[Data['BuildingType'].isin(["NonResidential","Nonresidential COS","Nonresidential WA"])].copy()


print(DataNR.describe(exclude="object").columns.tolist())

['OSEBuildingID', 'DataYear', 'ZipCode', 'CouncilDistrictCode', 'Latitude', 'Longitude', 'YearBuilt', 'NumberofBuildings', 'NumberofFloors', 'PropertyGFATotal', 'PropertyGFAParking', 'PropertyGFABuilding(s)', 'LargestPropertyUseTypeGFA', 'SecondLargestPropertyUseTypeGFA', 'ThirdLargestPropertyUseTypeGFA', 'ENERGYSTARScore', 'SiteEUI(kBtu/sf)', 'SiteEUIWN(kBtu/sf)', 'SourceEUI(kBtu/sf)', 'SourceEUIWN(kBtu/sf)', 'SiteEnergyUse(kBtu)', 'SiteEnergyUseWN(kBtu)', 'SteamUse(kBtu)', 'Electricity(kWh)', 'Electricity(kBtu)', 'NaturalGas(therms)', 'NaturalGas(kBtu)', 'DefaultData', 'Comments', 'TotalGHGEmissions', 'GHGEmissionsIntensity']


DataNR['PrimaryPropertyType'].unique()

array(['Hotel', 'Other', 'Mixed Use Property', 'University',
       'Small- and Mid-Sized Office', 'Self-Storage Facility',
       'Warehouse', 'K-12 School', 'Large Office',
       'Senior Care Community', 'Medical Office', 'Retail Store',
       'Hospital', 'Residence Hall', 'Distribution Center',
       'Worship Facility', 'Supermarket / Grocery Store', 'Laboratory',
       'Refrigerated Warehouse', 'Restaurant', 'Low-Rise Multifamily',
       'Office'], dtype=object)


DataNR['NumberofBuildings'].unique()

array([ 1.,  3.,  0.,  2.,  4.,  6.,  9.,  5., nan,  7.,  8.])


DataNR['PropertyGFATotal'].unique()

array([ 88434, 103566, 956110, ...,  13157,  14101,  18258])


DataNR['Neighborhood'].unique()

array(['DOWNTOWN', 'NORTHEAST', 'EAST', 'LAKE UNION', 'GREATER DUWAMISH',
       'BALLARD', 'NORTHWEST', 'MAGNOLIA / QUEEN ANNE', 'CENTRAL',
       'SOUTHWEST', 'SOUTHEAST', 'NORTH', 'DELRIDGE', 'North', 'Delridge',
       'Ballard', 'Northwest', 'Central', 'DELRIDGE NEIGHBORHOODS'],
      dtype=object)


DataNRCO2E=DataNR[['OSEBuildingID','PrimaryPropertyType','Latitude', 'Longitude','Neighborhood', 'YearBuilt','NumberofBuildings','NumberofFloors','TotalGHGEmissions','SiteEnergyUse(kBtu)','ENERGYSTARScore']].copy()


DataNRCO2E['BuildingAge']=2016-DataNRCO2E['YearBuilt']


DataNRCO2E


print(DataNRCO2E.isnull().sum().sum())
DataNRCO2E.dropna(inplace=True)

546


DataNRCO2E


DataNRCO2E.describe(include="object")


DataNRCO2E.drop(DataNRCO2E[DataNRCO2E["PrimaryPropertyType"].isin(['Low-Rise Multifamily','Residence Hall'])].index,axis=0,inplace=True)


DataNRCO2E.skew()

<ipython-input-27-3b9fa3907873>:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  DataNRCO2E.skew()

OSEBuildingID           0.345459
Latitude                0.247815
Longitude               0.011083
YearBuilt              -0.394848
NumberofBuildings      10.418149
NumberofFloors          4.874361
TotalGHGEmissions      12.471995
SiteEnergyUse(kBtu)     8.861597
ENERGYSTARScore        -0.675025
BuildingAge             0.394848
dtype: float64


DataNRCO2E.hist(alpha=0.5, figsize=(20, 10))
plt.tight_layout()


DataNRCO2E['SiteEnergyUse(kBtu)']=np.log1p(DataNRCO2E['SiteEnergyUse(kBtu)'])
sns.distplot(DataNRCO2E['SiteEnergyUse(kBtu)'])

/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

<AxesSubplot:xlabel='SiteEnergyUse(kBtu)', ylabel='Density'>


DataNRCO2E["TotalGHGEmissions"]=np.log1p(DataNRCO2E["TotalGHGEmissions"])
sns.distplot(DataNRCO2E["TotalGHGEmissions"])

/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

<AxesSubplot:xlabel='TotalGHGEmissions', ylabel='Density'>


from numpy import mean, std, absolute
from pandas import read_csv
from sklearn.model_selection import cross_val_score, KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler,StandardScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from xgboost import XGBRegressor


X,y=DataNRCO2E.drop(["OSEBuildingID","TotalGHGEmissions","SiteEnergyUse(kBtu)"], axis=1), DataNRCO2E["TotalGHGEmissions"]
print(X.shape, y.shape)

(986, 9) (986,)


# determine categorical and numerical features
numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
# define the data preparation for the columns
t = [('cat', OneHotEncoder(), categorical_ix), ('num', StandardScaler(), numerical_ix.values)]
col_transform = ColumnTransformer(transformers=t)


col_transform

ColumnTransformer(transformers=[('cat', OneHotEncoder(),
                                 Index(['PrimaryPropertyType', 'Neighborhood'], dtype='object')),
                                ('num', StandardScaler(),
                                 array(['Latitude', 'Longitude', 'YearBuilt', 'NumberofBuildings',
       'NumberofFloors', 'ENERGYSTARScore', 'BuildingAge'], dtype=object))])


# define the models
modelDummyReg= DummyRegressor(strategy="mean")
modelLR= LinearRegression()
modelSVR = SVR(kernel='rbf',gamma='scale',C=100)
modelXGB_T1 = XGBRegressor()


# define the data preparation and modeling pipeline
pipelineDummyReg = Pipeline(steps=[('prep',col_transform), ('m', modelDummyReg)])
pipelineLR = Pipeline(steps=[('prep',col_transform), ('m', modelLR)])
pipelineSVR = Pipeline(steps=[('prep',col_transform), ('m', modelSVR)])
pipelineXGB = Pipeline(steps=[('prep',col_transform), ('m', modelXGB_T1)])


# define the model cross-validation configuration
cv = KFold(n_splits=10, shuffle=True, random_state=1)


# evaluate the pipeline using cross validation and calculate MAE
scoresDummyReg = cross_val_score(pipelineDummyReg, X, y, scoring='neg_mean_absolute_error',error_score="raise", cv=cv, n_jobs=-1)
# convert MAE scores to positive values
scoresDummyReg = absolute(scoresDummyReg)
# summarize the model performance
print('DummyReg MAE: %.3f (%.3f)' % (mean(scoresDummyReg), std(scoresDummyReg)))

DummyReg MAE: 1.151 (0.091)


# evaluate the pipeline using cross validation and calculate MAE
scoresLR = cross_val_score(pipelineLR, X, y, scoring='neg_mean_absolute_error',error_score="raise", cv=cv, n_jobs=-1)
# convert MAE scores to positive values
scoresLR = absolute(scoresLR)
# summarize the model performance
print('LinearRegression MAE: %.3f (%.3f)' % (mean(scoresLR), std(scoresLR)))

LinearRegression MAE: 0.841 (0.066)


# evaluate the pipeline using cross validation and calculate MAE
scoresSVR = cross_val_score(pipelineSVR, X, y, scoring='neg_mean_absolute_error',error_score="raise", cv=cv, n_jobs=-1)
# convert MAE scores to positive values
scoresSVR = absolute(scoresSVR)
# summarize the model performance
print('SVR MAE: %.3f (%.3f)' % (mean(scoresSVR), std(scoresSVR)))

SVR MAE: 1.005 (0.062)


# evaluate the pipeline using cross validation and calculate MAE
scoresXGB = cross_val_score(pipelineXGB, X, y, scoring='neg_mean_absolute_error',error_score="raise", cv=cv, n_jobs=-1)
# convert MAE scores to positive values
scoresXGB = absolute(scoresXGB)
# summarize the model performance
print('XGB MAE: %.3f (%.3f)' % (mean(scoresXGB), std(scoresXGB)))

XGB MAE: 0.806 (0.088)


# evaluate the pipeline using cross validation and calculate R2
scoresDummyReg = cross_val_score(pipelineDummyReg, X, y, scoring='r2',error_score="raise", cv=cv, n_jobs=-1)

# summarize the model performance
print('DummyReg r2: %.3f (%.3f)' % (mean(scoresDummyReg), std(scoresDummyReg)))

DummyReg r2: -0.010 (0.010)


# evaluate the pipeline using cross validation and calculate R2
scoresSVR = cross_val_score(pipelineSVR, X, y, scoring='r2',error_score="raise", cv=cv, n_jobs=-1)
# summarize the model performance
print('SVR r2: %.3f (%.3f)' % (mean(scoresSVR), std(scoresSVR)))

SVR r2: 0.158 (0.128)


# evaluate the pipeline using cross validation and calculate r2
scoresXGB = cross_val_score(pipelineXGB, X, y, scoring='r2',error_score="raise", cv=cv, n_jobs=-1)
# summarize the model performance
print('XGB r2: %.3f (%.3f)' % (mean(scoresXGB), std(scoresXGB)))

XGB r2: 0.468 (0.082)


from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
transformer = make_column_transformer(
    (OneHotEncoder(), ['PrimaryPropertyType', 'Neighborhood']),
    remainder='passthrough')

transformed = transformer.fit_transform(X).toarray()
transformed_df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())


X_train,X_test,y_train,y_test = train_test_split(transformed_df,y,test_size=0.3,random_state=42)


modelXGB_T1.fit(X_train,y_train)

[09:32:27] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.

XGBRegressor()


ypred=modelXGB_T1.predict(X_test)


modelXGB_T1.score(X_test,ypred)

1.0


pd.DataFrame(np.expm1(modelXGB_T1.predict(transformed_df)),columns=['ypred']).join(np.expm1(y.reset_index(drop=True)))


pd.DataFrame(data=modelXGB_T1.feature_importances_).nlargest(10,0)


pd.DataFrame(transformed_df.columns).T


X,y=DataNRCO2E.drop(["OSEBuildingID","TotalGHGEmissions","SiteEnergyUse(kBtu)"], axis=1), DataNRCO2E["SiteEnergyUse(kBtu)"]
# determine categorical and numerical features
numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
# define the data preparation for the columns
t = [('cat', OneHotEncoder(), categorical_ix), ('num', StandardScaler(), numerical_ix.values)]
col_transform = ColumnTransformer(transformers=t)
# define the models
modelDummyReg= DummyRegressor(strategy="mean")
modelLR= LinearRegression()
modelSVR = SVR(kernel='rbf',gamma='scale',C=100)
modelXGB_T2 = XGBRegressor()
# define the data preparation and modeling pipeline
pipelineDummyReg = Pipeline(steps=[('prep',col_transform), ('m', modelDummyReg)])
pipelineLR = Pipeline(steps=[('prep',col_transform), ('m', modelLR)])
pipelineSVR = Pipeline(steps=[('prep',col_transform), ('m', modelSVR)])
pipelineXGB = Pipeline(steps=[('prep',col_transform), ('m', modelXGB_T2)])
# define the model cross-validation configuration
cv = KFold(n_splits=10, shuffle=True, random_state=1)


# evaluate the pipeline using cross validation and calculate MAE
scoresDummyReg = cross_val_score(pipelineDummyReg, X, y, scoring='neg_mean_absolute_error',error_score="raise", cv=cv, n_jobs=-1)
# convert MAE scores to positive values
scoresDummyReg = absolute(scoresDummyReg)
# summarize the model performance
print('DummyReg MAE: %.3f (%.3f)' % (mean(scoresDummyReg), std(scoresDummyReg)))
# evaluate the pipeline using cross validation and calculate MAE
scoresLR = cross_val_score(pipelineLR, X, y, scoring='neg_mean_absolute_error',error_score="raise", cv=cv, n_jobs=-1)
# convert MAE scores to positive values
scoresLR = absolute(scoresLR)
# summarize the model performance
print('LinearRegression MAE: %.3f (%.3f)' % (mean(scoresLR), std(scoresLR)))
# evaluate the pipeline using cross validation and calculate MAE
scoresSVR = cross_val_score(pipelineSVR, X, y, scoring='neg_mean_absolute_error',error_score="raise", cv=cv, n_jobs=-1)
# convert MAE scores to positive values
scoresSVR = absolute(scoresSVR)
# summarize the model performance
print('SVR MAE: %.3f (%.3f)' % (mean(scoresSVR), std(scoresSVR)))
# evaluate the pipeline using cross validation and calculate MAE
scoresXGB = cross_val_score(pipelineXGB, X, y, scoring='neg_mean_absolute_error',error_score="raise", cv=cv, n_jobs=-1)
# convert MAE scores to positive values
scoresXGB = absolute(scoresXGB)
# summarize the model performance
print('XGB MAE: %.3f (%.3f)' % (mean(scoresXGB), std(scoresXGB)))

DummyReg MAE: 1.066 (0.097)
LinearRegression MAE: 0.611 (0.038)
SVR MAE: 0.712 (0.070)
XGB MAE: 0.577 (0.049)


# evaluate the pipeline using cross validation and calculate R2
scoresDummyReg = cross_val_score(pipelineDummyReg, X, y, scoring='r2',error_score="raise", cv=cv, n_jobs=-1)
# summarize the model performance
print('DummyReg r2: %.3f (%.3f)' % (mean(scoresDummyReg), std(scoresDummyReg)))
# evaluate the pipeline using cross validation and calculate R2
scoresSVR = cross_val_score(pipelineSVR, X, y, scoring='r2',error_score="raise", cv=cv, n_jobs=-1)
# summarize the model performance
print('SVR r2: %.3f (%.3f)' % (mean(scoresSVR), std(scoresSVR)))
# evaluate the pipeline using cross validation and calculate r2
scoresXGB = cross_val_score(pipelineXGB, X, y, scoring='r2',error_score="raise", cv=cv, n_jobs=-1)
# summarize the model performance
print('XGB r2: %.3f (%.3f)' % (mean(scoresXGB), std(scoresXGB)))

DummyReg r2: -0.011 (0.012)
SVR r2: 0.454 (0.119)
XGB r2: 0.648 (0.074)


from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
transformer = make_column_transformer(
    (OneHotEncoder(), ['PrimaryPropertyType', 'Neighborhood']),
    remainder='passthrough')

transformed = transformer.fit_transform(X).toarray()
transformed_df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
X_train,X_test,y_train,y_test = train_test_split(transformed_df,y,test_size=0.3,random_state=42)
modelXGB_T2.fit(X_train,y_train)
ypred=modelXGB_T2.predict(X_test)
modelXGB_T2.score(X_test,ypred)

[09:32:45] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.

1.0


pd.DataFrame(np.expm1(modelXGB_T2.predict(transformed_df)),columns=['ypred']).join(np.expm1(y.reset_index(drop=True)))

	OSEBuildingID	DataYear	BuildingType	PrimaryPropertyType	PropertyName	Address	City	State	ZipCode	TaxParcelIdentificationNumber	...	Electricity(kWh)	Electricity(kBtu)	NaturalGas(therms)	NaturalGas(kBtu)	DefaultData	Comments	ComplianceStatus	Outlier	TotalGHGEmissions	GHGEmissionsIntensity
0	1	2016	NonResidential	Hotel	Mayflower park hotel	405 Olive way	Seattle	WA	98101.0	0659000030	...	1.156514e+06	3946027.0	12764.52930	1276453.0	False	NaN	Compliant	NaN	249.98	2.83
1	2	2016	NonResidential	Hotel	Paramount Hotel	724 Pine street	Seattle	WA	98101.0	0659000220	...	9.504252e+05	3242851.0	51450.81641	5145082.0	False	NaN	Compliant	NaN	295.86	2.86
2	3	2016	NonResidential	Hotel	5673-The Westin Seattle	1900 5th Avenue	Seattle	WA	98101.0	0659000475	...	1.451544e+07	49526664.0	14938.00000	1493800.0	False	NaN	Compliant	NaN	2089.28	2.19

	OSEBuildingID	DataYear	ZipCode	CouncilDistrictCode	Latitude	Longitude	YearBuilt	NumberofBuildings	NumberofFloors	PropertyGFATotal	...	SiteEnergyUseWN(kBtu)	SteamUse(kBtu)	Electricity(kWh)	Electricity(kBtu)	NaturalGas(therms)	NaturalGas(kBtu)	DefaultData	Comments	TotalGHGEmissions	GHGEmissionsIntensity
count	3376.000000	3376.0	3360.000000	3376.000000	3376.000000	3376.000000	3376.000000	3368.000000	3376.000000	3.376000e+03	...	3.370000e+03	3.367000e+03	3.367000e+03	3.367000e+03	3.367000e+03	3.367000e+03	3376	0.0	3367.000000	3367.000000
unique	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	2	NaN	NaN	NaN
top	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	False	NaN	NaN	NaN
freq	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	3263	NaN	NaN	NaN
mean	21208.991114	2016.0	98116.949107	4.439277	47.624033	-122.334795	1968.573164	1.106888	4.709123	9.483354e+04	...	5.276726e+06	2.745959e+05	1.086639e+06	3.707612e+06	1.368505e+04	1.368505e+06	NaN	NaN	119.723971	1.175916
std	12223.757015	0.0	18.615205	2.120625	0.047758	0.027203	33.088156	2.108402	5.494465	2.188376e+05	...	1.593879e+07	3.912173e+06	4.352478e+06	1.485066e+07	6.709781e+04	6.709781e+06	NaN	NaN	538.832227	1.821452
min	1.000000	2016.0	98006.000000	1.000000	47.499170	-122.414250	1900.000000	0.000000	0.000000	1.128500e+04	...	0.000000e+00	0.000000e+00	-3.382680e+04	-1.154170e+05	0.000000e+00	0.000000e+00	NaN	NaN	-0.800000	-0.020000
25%	19990.750000	2016.0	98105.000000	3.000000	47.599860	-122.350662	1948.000000	1.000000	2.000000	2.848700e+04	...	9.701822e+05	0.000000e+00	1.874229e+05	6.394870e+05	0.000000e+00	0.000000e+00	NaN	NaN	9.495000	0.210000
50%	23112.000000	2016.0	98115.000000	4.000000	47.618675	-122.332495	1975.000000	1.000000	4.000000	4.417500e+04	...	1.904452e+06	0.000000e+00	3.451299e+05	1.177583e+06	3.237538e+03	3.237540e+05	NaN	NaN	33.920000	0.610000
75%	25994.250000	2016.0	98122.000000	7.000000	47.657115	-122.319407	1997.000000	1.000000	5.000000	9.099200e+04	...	4.381429e+06	0.000000e+00	8.293178e+05	2.829632e+06	1.189033e+04	1.189034e+06	NaN	NaN	93.940000	1.370000
max	50226.000000	2016.0	98272.000000	7.000000	47.733870	-122.220966	2015.000000	111.000000	99.000000	9.320156e+06	...	4.716139e+08	1.349435e+08	1.925775e+08	6.570744e+08	2.979090e+06	2.979090e+08	NaN	NaN	16870.980000	34.090000

	OSEBuildingID	PrimaryPropertyType	Latitude	Longitude	Neighborhood	YearBuilt	NumberofBuildings	NumberofFloors	TotalGHGEmissions	SiteEnergyUse(kBtu)	ENERGYSTARScore	BuildingAge
0	1	Hotel	47.61220	-122.33799	DOWNTOWN	1927	1.0	12	249.98	7.226362e+06	60.0	89
1	2	Hotel	47.61317	-122.33393	DOWNTOWN	1996	1.0	11	295.86	8.387933e+06	61.0	20
2	3	Hotel	47.61393	-122.33810	DOWNTOWN	1969	1.0	41	2089.28	7.258702e+07	43.0	47
3	5	Hotel	47.61412	-122.33664	DOWNTOWN	1926	1.0	10	286.43	6.794584e+06	56.0	90
4	8	Hotel	47.61375	-122.34047	DOWNTOWN	1980	1.0	18	505.01	1.417261e+07	75.0	36
...	...	...	...	...	...	...	...	...	...	...	...	...
3371	50222	Office	47.56722	-122.31154	GREATER DUWAMISH	1990	1.0	1	20.94	8.497457e+05	46.0	26
3372	50223	Other	47.59625	-122.32283	DOWNTOWN	2004	1.0	1	32.17	9.502762e+05	NaN	12
3373	50224	Other	47.63644	-122.35784	MAGNOLIA / QUEEN ANNE	1974	1.0	1	223.54	5.765898e+06	NaN	42
3374	50225	Mixed Use Property	47.52832	-122.32431	GREATER DUWAMISH	1989	1.0	1	22.11	7.194712e+05	NaN	27
3375	50226	Mixed Use Property	47.53939	-122.29536	GREATER DUWAMISH	1938	1.0	1	41.27	1.152896e+06	NaN	78

	OSEBuildingID	PrimaryPropertyType	Latitude	Longitude	Neighborhood	YearBuilt	NumberofBuildings	NumberofFloors	TotalGHGEmissions	SiteEnergyUse(kBtu)	ENERGYSTARScore	BuildingAge
0	1	Hotel	47.61220	-122.33799	DOWNTOWN	1927	1.0	12	249.98	7.226362e+06	60.0	89
1	2	Hotel	47.61317	-122.33393	DOWNTOWN	1996	1.0	11	295.86	8.387933e+06	61.0	20
2	3	Hotel	47.61393	-122.33810	DOWNTOWN	1969	1.0	41	2089.28	7.258702e+07	43.0	47
3	5	Hotel	47.61412	-122.33664	DOWNTOWN	1926	1.0	10	286.43	6.794584e+06	56.0	90
4	8	Hotel	47.61375	-122.34047	DOWNTOWN	1980	1.0	18	505.01	1.417261e+07	75.0	36
...	...	...	...	...	...	...	...	...	...	...	...	...
3339	50069	Small- and Mid-Sized Office	47.53161	-122.29944	GREATER DUWAMISH	1929	1.0	2	134.80	4.420650e+06	9.0	87
3347	50081	K-12 School	47.58831	-122.30650	GREATER DUWAMISH	2015	1.0	3	9.24	1.325973e+06	77.0	1
3366	50210	Office	47.63572	-122.37525	MAGNOLIA / QUEEN ANNE	1952	1.0	1	3.50	5.026677e+05	75.0	64
3369	50220	Office	47.56440	-122.27813	SOUTHEAST	1960	1.0	1	7.79	3.878100e+05	93.0	56
3371	50222	Office	47.56722	-122.31154	GREATER DUWAMISH	1990	1.0	1	20.94	8.497457e+05	46.0	26

	ypred	TotalGHGEmissions
0	319.701172	249.98
1	304.534607	295.86
2	990.298706	2089.28
3	319.701172	286.43
4	342.030884	505.01
...	...	...
981	26.405626	134.80
982	24.814886	9.24
983	28.213591	3.50
984	22.537758	7.79
985	30.965725	20.94

Objectif:¶

Observation¶

Filtering Non Residential buildings¶

We choose the two following features for energgy and CO2¶

Let's gets as many features as possible to help predict the CO2 compsumption and the energy¶

dataframe with chosen features¶

Feature engineering: let's create features from others¶

Attributes skewed¶

Log transformation for skewed target data¶

Prediction For Target TotalGHGEmissions¶

MAE SCORE¶

R2 Score¶

We choose Xgboost for model( best R2 and MAE scores)¶

Prediction of Target TotalGHGEmissions¶

Energy Star Usefull Or not?¶

Let's use the feature importance¶

Prediction For Target SiteEnergy(kBtu)¶

MAE¶

R2¶

We choose Xgboost for model( best R2 and MAE scores)¶

Prediction of Target SiteEnergyUse(kBtu)¶

	PrimaryPropertyType	Neighborhood
count	1006	1006
unique	18	18
top	Small- and Mid-Sized Office	DOWNTOWN
freq	238	249

	0
38	0.152405
12	0.123078
18	0.082017
13	0.072722
14	0.053095
39	0.043498
11	0.031702
4	0.031545
8	0.031119
0	0.028707

	ypred	SiteEnergyUse(kBtu)
0	9.411165e+06	7.226363e+06
1	1.160892e+07	8.387933e+06
2	3.726919e+07	7.258702e+07
3	9.016754e+06	6.794584e+06
4	1.540815e+07	1.417261e+07
...	...	...
981	1.798050e+06	4.420650e+06
982	2.200798e+06	1.325973e+06
983	1.222920e+06	5.026677e+05
984	7.690450e+05	3.878100e+05
985	1.485874e+06	8.497457e+05